The objective of this notebook is to discover Quora insincere questions' topics, aka target = 1.

In [1]:
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
import warnings, time, gc

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook, reset_output
from bokeh.palettes import d3
import bokeh.models as bmo
from bokeh.io import save, output_file

import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer 

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.manifold import TSNE

from wordcloud import WordCloud

np.random.seed(32)
color = sns.color_palette("Set2")
warnings.filterwarnings("ignore")
stop_words = set(stopwords.words("english"))
punctuations = string.punctuation
output_notebook()

%matplotlib inline

train = pd.read_csv("../input/train.csv")
Loading BokehJS ...
In [2]:
train.head()
Out[2]:
qid question_text target
0 00002165364db923c7e6 How did Quebec nationalists see their province... 0
1 000032939017120e6e44 Do you have an adopted dog, how would you enco... 0
2 0000412ca6e4628ce2cf Why does velocity affect time? Does velocity a... 0
3 000042bf85aa498cd78e How did Otto von Guericke used the Magdeburg h... 0
4 0000455dfa3e01eae3af Can I convert montra helicon D to a mountain b... 0
In [3]:
train.isna().sum()
Out[3]:
qid              0
question_text    0
target           0
dtype: int64

Target Distrinbution

In [4]:
target_count = train["target"].value_counts()

plt.figure(figsize = (8, 5))
ax = sns.barplot(target_count.index, target_count.values)
rects = ax.patches
labels = target_count.values
for rect, label in zip(rects, labels):
    ax.text(rect.get_x() + rect.get_width()/2, rect.get_height() + 5,
           label, ha = "center", va = "bottom")
plt.show()

Question Length Distribution

In [5]:
train["quest_len"] = train["question_text"].apply(lambda x: len(x.split()))
In [6]:
sincere = train[train["target"] == 0]
insincere = train[train["target"] == 1]

plt.figure(figsize = (15, 8))
sns.distplot(sincere["quest_len"], hist = True, label = "sincere")
sns.distplot(insincere["quest_len"], hist = True, label = "insincere")
plt.legend(fontsize = 10)
plt.title("Questions Length Distribution by Class", fontsize = 12)
plt.show()

Data Cleaning

In [8]:
# Credit: https://www.kaggle.com/jagangupta/stop-the-s-toxic-comments-eda

lem = WordNetLemmatizer()
tokenizer = TweetTokenizer()

def clean_text(question):
    """
    This function receives comments and returns clean word-list
    """
    #Convert to lower case , so that Hi and hi are the same
    question = question.lower()
    #remove \n
    question = re.sub("\\n", "", question)
    #remove disteacting single quotes
    question = re.sub("\'", "", question)
    # remove new line characters
#     question = re.sub('s+', " ", question)
    
    #Split the sentences into words
    words = tokenizer.tokenize(question)
    
    # (')aphostophe  replacement (ie)   you're --> you are  
    # ( basic dictionary lookup : master dictionary present in a hidden block of code)
    words = [APPO[word] if word in APPO else word for word in words]
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if w not in stop_words and w not in punctuations]

    clean_sent = " ".join(words)
    # remove any non alphanum, digit character
#     clean_sent = re.sub("\W+", " ", clean_sent)
#     clean_sent = re.sub("  ", " ", clean_sent)
    
    return clean_sent
In [9]:
sincere["clean_question_text"] = sincere["question_text"].apply(lambda question: clean_text(question))
insincere["clean_question_text"] = insincere["question_text"].apply(lambda question: clean_text(question))
In [10]:
insincere.head()
Out[10]:
qid question_text target quest_len clean_question_text
22 0000e91571b60c2fb487 Has the United States become the largest dicta... 1 11 unite state become largest dictatorship world
30 00013ceca3f624b09f42 Which babies are more sweeter to their parents... 1 15 baby sweeter parent dark skin baby light skin ...
110 0004a7fcb2bf73076489 If blacks support school choice and mandatory ... 1 15 black support school choice mandatory sentence...
114 00052793eaa287aff1e1 I am gay boy and I love my cousin (boy). He is... 1 34 gay boy love cousin boy sexy dont know hot wan...
115 000537213b01fd77b58a Which races have the smallest penis? 1 6 race smallest penis

Insincere Questions Topic Modeling

In [11]:
cv = CountVectorizer(min_df = 10,
                     max_features = 100000,
                     analyzer = "word",
                     ngram_range = (1, 2),
                     stop_words = "english",
                     token_pattern = '[a-zA-Z]')

count_vectors = cv.fit_transform(insincere["clean_question_text"])
In [12]:
# params = {"n_components": [5, 10, 20, 30, 40, 50]}

# lda_model = LatentDirichletAllocation(n_components = n_topics, 
#                                       # we choose a small n_components for time convenient
#                                       # will find a appropriate n_components later 
#                                       learning_method = "online",
#                                       batch_size = 128,
#                                       evaluate_every = -1,
#                                       max_iter = 20,
#                                       random_state = 32,
#                                       n_jobs = -1)

# model = GridSearchCV(lda_model, param_grid = params)
# model.fit(count_vectors)

# best_lda_model = model.best_estimator_
# best_lda_model

After applying Grid Search, we found the optimial n_components is between 5 to 10. In this case, we pick the 'mean' which is 8.

In [13]:
n_topics = 8
lda_model = LatentDirichletAllocation(n_components = n_topics, 
                                      learning_method = "online",
                                      batch_size = 128,
                                      evaluate_every = -1,
                                      max_iter = 20,
                                      random_state = 32,
                                      n_jobs = -1)

question_topics = lda_model.fit_transform(count_vectors)
temp = question_topics

To get a better LDA model, we need to maximize log likelihood and minimize perplexity.

In [14]:
print("Log Likelihood: {} \nPerplexity: {}".format(lda_model.score(count_vectors), 
                                                   lda_model.perplexity(count_vectors)))
Log Likelihood: -34145790.4709457 
Perplexity: 123.16328146685547
In [15]:
tsne_model = TSNE(n_components = 2, verbose = 1, random_state = 32, n_iter = 500)
tsne_lda = tsne_model.fit_transform(question_topics)
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 80810 samples in 0.084s...
[t-SNE] Computed neighbors for 80810 samples in 11.710s...
[t-SNE] Computed conditional probabilities for sample 1000 / 80810
[t-SNE] Computed conditional probabilities for sample 2000 / 80810
[t-SNE] Computed conditional probabilities for sample 3000 / 80810
[t-SNE] Computed conditional probabilities for sample 4000 / 80810
[t-SNE] Computed conditional probabilities for sample 5000 / 80810
[t-SNE] Computed conditional probabilities for sample 6000 / 80810
[t-SNE] Computed conditional probabilities for sample 7000 / 80810
[t-SNE] Computed conditional probabilities for sample 8000 / 80810
[t-SNE] Computed conditional probabilities for sample 9000 / 80810
[t-SNE] Computed conditional probabilities for sample 10000 / 80810
[t-SNE] Computed conditional probabilities for sample 11000 / 80810
[t-SNE] Computed conditional probabilities for sample 12000 / 80810
[t-SNE] Computed conditional probabilities for sample 13000 / 80810
[t-SNE] Computed conditional probabilities for sample 14000 / 80810
[t-SNE] Computed conditional probabilities for sample 15000 / 80810
[t-SNE] Computed conditional probabilities for sample 16000 / 80810
[t-SNE] Computed conditional probabilities for sample 17000 / 80810
[t-SNE] Computed conditional probabilities for sample 18000 / 80810
[t-SNE] Computed conditional probabilities for sample 19000 / 80810
[t-SNE] Computed conditional probabilities for sample 20000 / 80810
[t-SNE] Computed conditional probabilities for sample 21000 / 80810
[t-SNE] Computed conditional probabilities for sample 22000 / 80810
[t-SNE] Computed conditional probabilities for sample 23000 / 80810
[t-SNE] Computed conditional probabilities for sample 24000 / 80810
[t-SNE] Computed conditional probabilities for sample 25000 / 80810
[t-SNE] Computed conditional probabilities for sample 26000 / 80810
[t-SNE] Computed conditional probabilities for sample 27000 / 80810
[t-SNE] Computed conditional probabilities for sample 28000 / 80810
[t-SNE] Computed conditional probabilities for sample 29000 / 80810
[t-SNE] Computed conditional probabilities for sample 30000 / 80810
[t-SNE] Computed conditional probabilities for sample 31000 / 80810
[t-SNE] Computed conditional probabilities for sample 32000 / 80810
[t-SNE] Computed conditional probabilities for sample 33000 / 80810
[t-SNE] Computed conditional probabilities for sample 34000 / 80810
[t-SNE] Computed conditional probabilities for sample 35000 / 80810
[t-SNE] Computed conditional probabilities for sample 36000 / 80810
[t-SNE] Computed conditional probabilities for sample 37000 / 80810
[t-SNE] Computed conditional probabilities for sample 38000 / 80810
[t-SNE] Computed conditional probabilities for sample 39000 / 80810
[t-SNE] Computed conditional probabilities for sample 40000 / 80810
[t-SNE] Computed conditional probabilities for sample 41000 / 80810
[t-SNE] Computed conditional probabilities for sample 42000 / 80810
[t-SNE] Computed conditional probabilities for sample 43000 / 80810
[t-SNE] Computed conditional probabilities for sample 44000 / 80810
[t-SNE] Computed conditional probabilities for sample 45000 / 80810
[t-SNE] Computed conditional probabilities for sample 46000 / 80810
[t-SNE] Computed conditional probabilities for sample 47000 / 80810
[t-SNE] Computed conditional probabilities for sample 48000 / 80810
[t-SNE] Computed conditional probabilities for sample 49000 / 80810
[t-SNE] Computed conditional probabilities for sample 50000 / 80810
[t-SNE] Computed conditional probabilities for sample 51000 / 80810
[t-SNE] Computed conditional probabilities for sample 52000 / 80810
[t-SNE] Computed conditional probabilities for sample 53000 / 80810
[t-SNE] Computed conditional probabilities for sample 54000 / 80810
[t-SNE] Computed conditional probabilities for sample 55000 / 80810
[t-SNE] Computed conditional probabilities for sample 56000 / 80810
[t-SNE] Computed conditional probabilities for sample 57000 / 80810
[t-SNE] Computed conditional probabilities for sample 58000 / 80810
[t-SNE] Computed conditional probabilities for sample 59000 / 80810
[t-SNE] Computed conditional probabilities for sample 60000 / 80810
[t-SNE] Computed conditional probabilities for sample 61000 / 80810
[t-SNE] Computed conditional probabilities for sample 62000 / 80810
[t-SNE] Computed conditional probabilities for sample 63000 / 80810
[t-SNE] Computed conditional probabilities for sample 64000 / 80810
[t-SNE] Computed conditional probabilities for sample 65000 / 80810
[t-SNE] Computed conditional probabilities for sample 66000 / 80810
[t-SNE] Computed conditional probabilities for sample 67000 / 80810
[t-SNE] Computed conditional probabilities for sample 68000 / 80810
[t-SNE] Computed conditional probabilities for sample 69000 / 80810
[t-SNE] Computed conditional probabilities for sample 70000 / 80810
[t-SNE] Computed conditional probabilities for sample 71000 / 80810
[t-SNE] Computed conditional probabilities for sample 72000 / 80810
[t-SNE] Computed conditional probabilities for sample 73000 / 80810
[t-SNE] Computed conditional probabilities for sample 74000 / 80810
[t-SNE] Computed conditional probabilities for sample 75000 / 80810
[t-SNE] Computed conditional probabilities for sample 76000 / 80810
[t-SNE] Computed conditional probabilities for sample 77000 / 80810
[t-SNE] Computed conditional probabilities for sample 78000 / 80810
[t-SNE] Computed conditional probabilities for sample 79000 / 80810
[t-SNE] Computed conditional probabilities for sample 80000 / 80810
[t-SNE] Computed conditional probabilities for sample 80810 / 80810
[t-SNE] Mean sigma: 0.011471
[t-SNE] KL divergence after 250 iterations with early exaggeration: 99.605843
[t-SNE] KL divergence after 500 iterations: 3.862653
In [16]:
question_topics = np.matrix(question_topics)
doc_topics = question_topics/question_topics.sum(axis = 1)

lda_keys = []
for i, tweet in enumerate(insincere["question_text"]):
    lda_keys += [doc_topics[i].argmax()]
    
tsne_lda_df = pd.DataFrame(tsne_lda, columns = ["x", "y"])
tsne_lda_df["qid"] = insincere["qid"].values
tsne_lda_df["question"] = insincere["question_text"].values
tsne_lda_df["topics"] = lda_keys
tsne_lda_df["topics"] = tsne_lda_df["topics"].map(int)
In [17]:
import random

def generate_color():
    color = "#{:02x}{:02x}{:02x}".format(*map(lambda x: random.randint(0, 255), range(3)))
    return color
In [18]:
colormap = np.array([generate_color() for t in range(n_topics)])
In [19]:
plot_lda = bp.figure(plot_width = 700, plot_height = 600, 
                    title = "LDA topics of Quora Questions",
                    tools = "pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                    x_axis_type = None, y_axis_type = None, min_border = 1)

source = ColumnDataSource(data = dict(x = tsne_lda_df["x"], y = tsne_lda_df["y"],
                         color = colormap[lda_keys],
                         qid = tsne_lda_df["qid"],
                         question = tsne_lda_df["question"],
                         topics = tsne_lda_df["topics"]))

plot_lda.scatter(x = "x", y = "y", color = "color", source = source)
hover = plot_lda.select(dict(type = HoverTool))
hover.tooltips = {"qid": "@qid","question": "@question", "topics": "@topics"}
show(plot_lda)

Although we can see some patterns in the visualization from above, the graph is difficult to interpret. The very reason for that is our model is unable to confidently assign a topic to every questions. This means that there are questions being assigned a low probability to a probable topic. To filter out such questions, we simply add a threshold factor.

Topic Probability => 0.5

In [20]:
threshold = 0.5
idx = np.amax(temp, axis = 1) >= threshold
question_topics = temp[idx]
In [21]:
tsne_model = TSNE(n_components = 2, verbose = 1, random_state = 32, n_iter = 500)
tsne_lda2 = tsne_model.fit_transform(question_topics)
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 33192 samples in 0.026s...
[t-SNE] Computed neighbors for 33192 samples in 2.149s...
[t-SNE] Computed conditional probabilities for sample 1000 / 33192
[t-SNE] Computed conditional probabilities for sample 2000 / 33192
[t-SNE] Computed conditional probabilities for sample 3000 / 33192
[t-SNE] Computed conditional probabilities for sample 4000 / 33192
[t-SNE] Computed conditional probabilities for sample 5000 / 33192
[t-SNE] Computed conditional probabilities for sample 6000 / 33192
[t-SNE] Computed conditional probabilities for sample 7000 / 33192
[t-SNE] Computed conditional probabilities for sample 8000 / 33192
[t-SNE] Computed conditional probabilities for sample 9000 / 33192
[t-SNE] Computed conditional probabilities for sample 10000 / 33192
[t-SNE] Computed conditional probabilities for sample 11000 / 33192
[t-SNE] Computed conditional probabilities for sample 12000 / 33192
[t-SNE] Computed conditional probabilities for sample 13000 / 33192
[t-SNE] Computed conditional probabilities for sample 14000 / 33192
[t-SNE] Computed conditional probabilities for sample 15000 / 33192
[t-SNE] Computed conditional probabilities for sample 16000 / 33192
[t-SNE] Computed conditional probabilities for sample 17000 / 33192
[t-SNE] Computed conditional probabilities for sample 18000 / 33192
[t-SNE] Computed conditional probabilities for sample 19000 / 33192
[t-SNE] Computed conditional probabilities for sample 20000 / 33192
[t-SNE] Computed conditional probabilities for sample 21000 / 33192
[t-SNE] Computed conditional probabilities for sample 22000 / 33192
[t-SNE] Computed conditional probabilities for sample 23000 / 33192
[t-SNE] Computed conditional probabilities for sample 24000 / 33192
[t-SNE] Computed conditional probabilities for sample 25000 / 33192
[t-SNE] Computed conditional probabilities for sample 26000 / 33192
[t-SNE] Computed conditional probabilities for sample 27000 / 33192
[t-SNE] Computed conditional probabilities for sample 28000 / 33192
[t-SNE] Computed conditional probabilities for sample 29000 / 33192
[t-SNE] Computed conditional probabilities for sample 30000 / 33192
[t-SNE] Computed conditional probabilities for sample 31000 / 33192
[t-SNE] Computed conditional probabilities for sample 32000 / 33192
[t-SNE] Computed conditional probabilities for sample 33000 / 33192
[t-SNE] Computed conditional probabilities for sample 33192 / 33192
[t-SNE] Mean sigma: 0.007594
[t-SNE] KL divergence after 250 iterations with early exaggeration: 82.625862
[t-SNE] KL divergence after 500 iterations: 2.305048
In [25]:
new_insincere = insincere[["qid", "question_text"]].copy()
new_insincere = new_insincere[idx]
In [26]:
question_topics = np.matrix(question_topics)
doc_topics = question_topics/question_topics.sum(axis = 1)

lda_keys = []
for i, tweet in enumerate(new_insincere["question_text"]):
    lda_keys += [doc_topics[i].argmax()]
    
tsne_lda_df2 = pd.DataFrame(tsne_lda2, columns = ["x", "y"])
tsne_lda_df2["qid"] = new_insincere["qid"].values
tsne_lda_df2["question"] = new_insincere["question_text"].values
tsne_lda_df2["topics"] = lda_keys
tsne_lda_df2["topics"] = tsne_lda_df2["topics"].map(int)
In [24]:
plot_lda = bp.figure(plot_width = 700, plot_height = 600, 
                    title = "LDA topics of Quora Questions",
                    tools = "pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                    x_axis_type = None, y_axis_type = None, min_border = 1)

source = ColumnDataSource(data = dict(x = tsne_lda_df2["x"], y = tsne_lda_df2["y"],
                         color = colormap[lda_keys],
                         qid = tsne_lda_df2["qid"],
                         question = tsne_lda_df2["question"],
                         topics = tsne_lda_df2["topics"]))

plot_lda.scatter(x = "x", y = "y", color = "color", source = source)
hover = plot_lda.select(dict(type = HoverTool))
hover.tooltips = {"qid": "@qid", "question": "@question", "topics": "@topics"}
show(plot_lda)

We get a much better visualization after using probability threshold.

Topic Probability < 0.5

In [27]:
idx = np.amax(temp, axis = 1) < threshold
question_topics = temp[idx]
In [28]:
tsne_model = TSNE(n_components = 2, verbose = 1, random_state = 32, n_iter = 500)
tsne_lda3 = tsne_model.fit_transform(question_topics)
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 47618 samples in 0.034s...
[t-SNE] Computed neighbors for 47618 samples in 6.280s...
[t-SNE] Computed conditional probabilities for sample 1000 / 47618
[t-SNE] Computed conditional probabilities for sample 2000 / 47618
[t-SNE] Computed conditional probabilities for sample 3000 / 47618
[t-SNE] Computed conditional probabilities for sample 4000 / 47618
[t-SNE] Computed conditional probabilities for sample 5000 / 47618
[t-SNE] Computed conditional probabilities for sample 6000 / 47618
[t-SNE] Computed conditional probabilities for sample 7000 / 47618
[t-SNE] Computed conditional probabilities for sample 8000 / 47618
[t-SNE] Computed conditional probabilities for sample 9000 / 47618
[t-SNE] Computed conditional probabilities for sample 10000 / 47618
[t-SNE] Computed conditional probabilities for sample 11000 / 47618
[t-SNE] Computed conditional probabilities for sample 12000 / 47618
[t-SNE] Computed conditional probabilities for sample 13000 / 47618
[t-SNE] Computed conditional probabilities for sample 14000 / 47618
[t-SNE] Computed conditional probabilities for sample 15000 / 47618
[t-SNE] Computed conditional probabilities for sample 16000 / 47618
[t-SNE] Computed conditional probabilities for sample 17000 / 47618
[t-SNE] Computed conditional probabilities for sample 18000 / 47618
[t-SNE] Computed conditional probabilities for sample 19000 / 47618
[t-SNE] Computed conditional probabilities for sample 20000 / 47618
[t-SNE] Computed conditional probabilities for sample 21000 / 47618
[t-SNE] Computed conditional probabilities for sample 22000 / 47618
[t-SNE] Computed conditional probabilities for sample 23000 / 47618
[t-SNE] Computed conditional probabilities for sample 24000 / 47618
[t-SNE] Computed conditional probabilities for sample 25000 / 47618
[t-SNE] Computed conditional probabilities for sample 26000 / 47618
[t-SNE] Computed conditional probabilities for sample 27000 / 47618
[t-SNE] Computed conditional probabilities for sample 28000 / 47618
[t-SNE] Computed conditional probabilities for sample 29000 / 47618
[t-SNE] Computed conditional probabilities for sample 30000 / 47618
[t-SNE] Computed conditional probabilities for sample 31000 / 47618
[t-SNE] Computed conditional probabilities for sample 32000 / 47618
[t-SNE] Computed conditional probabilities for sample 33000 / 47618
[t-SNE] Computed conditional probabilities for sample 34000 / 47618
[t-SNE] Computed conditional probabilities for sample 35000 / 47618
[t-SNE] Computed conditional probabilities for sample 36000 / 47618
[t-SNE] Computed conditional probabilities for sample 37000 / 47618
[t-SNE] Computed conditional probabilities for sample 38000 / 47618
[t-SNE] Computed conditional probabilities for sample 39000 / 47618
[t-SNE] Computed conditional probabilities for sample 40000 / 47618
[t-SNE] Computed conditional probabilities for sample 41000 / 47618
[t-SNE] Computed conditional probabilities for sample 42000 / 47618
[t-SNE] Computed conditional probabilities for sample 43000 / 47618
[t-SNE] Computed conditional probabilities for sample 44000 / 47618
[t-SNE] Computed conditional probabilities for sample 45000 / 47618
[t-SNE] Computed conditional probabilities for sample 46000 / 47618
[t-SNE] Computed conditional probabilities for sample 47000 / 47618
[t-SNE] Computed conditional probabilities for sample 47618 / 47618
[t-SNE] Mean sigma: 0.037428
[t-SNE] KL divergence after 250 iterations with early exaggeration: 99.665627
[t-SNE] KL divergence after 500 iterations: 3.397086
In [29]:
new_insincere2 = insincere[["qid", "question_text"]].copy()
new_insincere2 = new_insincere2[idx]
In [30]:
question_topics = np.matrix(question_topics)
doc_topics = question_topics/question_topics.sum(axis = 1)

lda_keys = []
for i, tweet in enumerate(new_insincere2["question_text"]):
    lda_keys += [doc_topics[i].argmax()]
    
tsne_lda_df3 = pd.DataFrame(tsne_lda3, columns = ["x", "y"])
tsne_lda_df3["qid"] = new_insincere2["qid"].values
tsne_lda_df3["question"] = new_insincere2["question_text"].values
tsne_lda_df3["topics"] = lda_keys
tsne_lda_df3["topics"] = tsne_lda_df2["topics"].map(int)
In [31]:
plot_lda = bp.figure(plot_width = 700, plot_height = 600, 
                    title = "LDA topics of Quora Questions",
                    tools = "pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                    x_axis_type = None, y_axis_type = None, min_border = 1)

source = ColumnDataSource(data = dict(x = tsne_lda_df3["x"], y = tsne_lda_df3["y"],
                         color = colormap[lda_keys],
                         qid = tsne_lda_df3["qid"],
                         question = tsne_lda_df3["question"],
                         topics = tsne_lda_df3["topics"]))

plot_lda.scatter(x = "x", y = "y", color = "color", source = source)
hover = plot_lda.select(dict(type = HoverTool))
hover.tooltips = {"qid": "@qid", "question": "@question", "topics": "@topics"}
show(plot_lda)

Insincere Topic Wordcloud

In [32]:
def create_wordcloud(i, data):
#     plt.subplot(int("52{}".format(ax+1)))
    wc =  WordCloud(max_words = 1000, stopwords = stop_words)
    wc.generate(" ".join(data))
    ax[int(i/2)][i%2].axis("off")
    ax[int(i/2)][i%2].set_title("Words Frequented in Topic {}".format(i), fontsize = 15)
    ax[int(i/2)][i%2].imshow(wc)
    
fig, ax = plt.subplots(4, 2, figsize = (25, 25))
for i in range(n_topics):
    text = tsne_lda_df[tsne_lda_df["topics"] == int(i)]["question"]
    create_wordcloud(int(i), text)

Topic Network

In [34]:
cor = squareform(pdist(tsne_lda2[:100], metric = "euclidean"))
In [33]:
import networkx as nx
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform

cor = squareform(pdist(tsne_lda2, metric = "euclidean"))
In [35]:
labels = {}

for l, i in enumerate(tsne_lda_df2["qid"]):
    labels[l] = i
In [36]:
G = nx.Graph()

for i in range(cor.shape[0]):
    for j in range(cor.shape[1]):
        if i == j:
            G.add_edge(i, j, weight = 0)
        else:
            G.add_edge(i, j, weight = 1.0/cor[i, j])
            
G = nx.relabel_nodes(G, labels)
            
edges = [(i, j) for i, j, w in G.edges(data = True) if w["weight"] > 0.8]
edge_weight = dict([((u, v, ), int(d["weight"])) for u, v, d in G.edges(data = True)])

pos = nx.spring_layout(G)

plt.figure(figsize = (10, 8))
nx.draw_networkx_nodes(G, pos, node_size = 100, alpha = 0.5)
nx.draw_networkx_edges(G, pos, edgelist = edges, width = 1)
nx.draw_networkx_labels(G, pos, font_size = 8, font_family = "sans-serif")
plt.show()